Data Cleaning
Import the necessary libraries
Read the dataset
Visualize the dataset
Find trends and relationships between features
Convert text into numerical features
Display the sparse matrix
Split the dataset into training and testing sets
Perform the learning operation - fit
Predict accuracy - score
Validate the result - Confusion Matrix, Classification Report
Repeat the process until the validation result is satisfactory (a minimal sketch of the full pipeline follows this list)
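Before loading the real data, the flow above can be read end to end on a toy corpus. This is a minimal sketch with made-up documents and hypothetical label codes, not the project data; the actual dataset, features, and models are built in the cells that follow.
# Minimal end-to-end sketch of the pipeline on a toy corpus (illustrative only)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

docs = ["valve failed under pressure", "operator missed the checklist",
        "pump seal worn out", "crew skipped the briefing"]
labels = [0, 1, 0, 1]  # hypothetical codes: 0 = EQUIPMENT, 1 = PEOPLE

X_demo = CountVectorizer().fit_transform(docs)        # text -> sparse count matrix
Xtr, Xte, ytr, yte = train_test_split(X_demo, labels, test_size=0.5)
demo_clf = MultinomialNB().fit(Xtr, ytr)              # learn - fit
print(demo_clf.score(Xte, yte))                       # predict accuracy - score
print(confusion_matrix(yte, demo_clf.predict(Xte)))   # validate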
from IPython.core.display import Image
Image(filename='C://Users//datta//Pictures//program_flow.jpg')
# Import All required packages
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
from matplotlib.axes import Subplot
%matplotlib inline
import pandas as pd
import numpy as np
from numpy.random import randn
from scipy import stats
import requests
import seaborn as sns
from sklearn import datasets, svm, tree, preprocessing, metrics
import sklearn.ensemble as ske
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import permutation_test_score
import warnings
warnings.filterwarnings("ignore")
from ipywidgets import interactive
from IPython.display import Audio, display
import ipywidgets as widgets
from IPython.display import display, clear_output, Javascript
from traitlets import Unicode
# nbconvert related imports
from nbconvert import get_export_names, export_by_name
from nbconvert.writers import FilesWriter
from nbformat import read, NO_CONVERT
from nbconvert.utils.exceptions import ConversionException
notebook_name = widgets.Text()
js = """IPython.notebook.kernel.widget_manager.get_model('%s').then(function(model) {
model.set('value', IPython.notebook.notebook_name);
model.save();
});
""" % notebook_name.model_id
display(Javascript(data=js))
filename = notebook_name.value
filename
exporter_names = widgets.Dropdown(options=get_export_names(), value='html')
export_button = widgets.Button(description="Export")
download_link = widgets.HTML(visible=False)
taws_df = pd.read_excel('just_3.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
# smaller version of the dataset: check non-null record counts per column
taws_df.count()
transformer = TfidfTransformer(smooth_idf=False)
transformer
corpus = taws_df['Merge']
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus).toarray()
print(X.shape)
X
vectorizer.get_feature_names()
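Note that the TfidfTransformer instantiated above is never applied in this run; the classifiers below are trained on the raw count matrix. If tf-idf weighting is wanted instead, it would slot in here, reusing the transformer and X from above:
# Optional: reweight the raw counts with the tf-idf transformer defined earlier
# (the rest of this notebook keeps using the raw count matrix X)
X_tfidf = transformer.fit_transform(X)
print(X_tfidf.shape)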
causes = taws_df["CAUSE_LEVEL_1"].unique()
cause_dict = {value:index for index, value in enumerate(causes)}
y = taws_df["CAUSE_LEVEL_1"].map(cause_dict)
cause_dict
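Since the classifiers will return these integer codes, it is handy to keep the inverse mapping around for turning predictions back into cause names later:
# Inverse mapping: integer code -> cause name, for decoding predictions
inv_cause_dict = {index: value for value, index in cause_dict.items()}
inv_cause_dict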
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = MultinomialNB()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
clf_dt = tree.DecisionTreeClassifier(max_depth=10)
clf_dt.fit(X_train, y_train)
clf_dt.score(X_test, y_test)
y_test
y_pred = clf.predict(X_test)
shuffle_validator = ShuffleSplit(n_splits=20, test_size=0.2)
def test_classifier(clf):
    scores = cross_val_score(clf, X, y, cv=shuffle_validator)
    print("Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std()))
cm = confusion_matrix(y_test, y_pred)
cm
plt.matshow(cm, cmap=plt.cm.summer)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
target_names = ['PEOPLE', 'ENVIRONMENT', 'EQUIPMENT', 'ORGANISATION', 'UNSPECIFIED']
print(classification_report(y_test, y_pred, target_names=target_names))
test_classifier(clf)
sample_dataframe = taws_df.sample(n=200)  # random sample of 200 records (unused below; the 200 test examples come from a separate file)
pwas_df = pd.read_excel('200ex.xlsx', 'Sheet1', index_col=None)
pwas_df.head()
# Predict a cause category for each of the 200 new records, one row at a time
for n in range(200):
    pm = pwas_df.iloc[n]
    vect_pm = vectorizer.transform(pm).toarray()
    m = clf.predict(vect_pm)
    print(m)
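The loop above scores one row at a time; the same predictions can be obtained in a single vectorized call. A sketch, assuming the free text sits in a 'Merge' column as in the training file (adjust the column name if the 200-example file differs):
# Vectorized alternative to the row-by-row loop (column name 'Merge' assumed)
all_preds = clf.predict(vectorizer.transform(pwas_df['Merge']))
print(all_preds)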
cons_df = pd.read_excel('machine_vs_human.xlsx', 'Sheet1', index_col=None)
m_test = cons_df['human']    # labels assigned by a human reviewer
m_pred = cons_df['machine']  # labels predicted by the classifier
cm2 = confusion_matrix(m_test, m_pred)
cm2
plt.matshow(cm2, cmap=plt.cm.summer)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
print(classification_report(m_test, m_pred, target_names=target_names))
nbconvert is a very useful tool for sharing notebooks and converting them into different file formats.
file_writer = FilesWriter()
def export(name, nb):
    # Get a unique key for the notebook and set it in the resources object.
    notebook_name = name[:name.rfind('.')]
    resources = {}
    resources['unique_key'] = notebook_name
    resources['output_files_dir'] = '%s_files' % notebook_name
    # Try to export
    try:
        output, resources = export_by_name(exporter_names.value, nb)
    except ConversionException as e:
        download_link.value = "<br>Could not export notebook!"
    else:
        write_results = file_writer.write(output, resources, notebook_name=notebook_name)
        download_link.value = "<br>Results: <a href='files/{filename}'><i>\"{filename}\"</i></a>".format(filename=write_results)
        download_link.visible = True
def handle_export(widget):
    with open(filename, 'r') as f:
        export(filename, read(f, NO_CONVERT))
export_button.on_click(handle_export)
display(exporter_names, export_button, download_link)
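For one-off conversions the same export is also available from the command line, without the widget machinery: running `jupyter nbconvert --to html <notebook>.ipynb` in a shell (substitute the actual notebook filename) produces the HTML file directly.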